# Always print this out before your assignment
# Records the R version, platform, locale, and attached/loaded packages
# for this run, so the results below can be tied to a specific environment.
sessionInfo()
## R version 4.1.2 (2021-11-01)
## Platform: x86_64-w64-mingw32/x64 (64-bit)
## Running under: Windows 10 x64 (build 19043)
##
## Matrix products: default
##
## locale:
## [1] LC_COLLATE=English_United States.1252
## [2] LC_CTYPE=English_United States.1252
## [3] LC_MONETARY=English_United States.1252
## [4] LC_NUMERIC=C
## [5] LC_TIME=English_United States.1252
##
## attached base packages:
## [1] stats graphics grDevices utils datasets methods
## [7] base
##
## other attached packages:
## [1] knitr_1.36
##
## loaded via a namespace (and not attached):
## [1] digest_0.6.28 R6_2.5.1 jsonlite_1.7.2 magrittr_2.0.1
## [5] evaluate_0.14 rlang_0.4.12 stringi_1.7.5 jquerylib_0.1.4
## [9] bslib_0.3.1 rmarkdown_2.11 tools_4.1.2 stringr_1.4.0
## [13] xfun_0.28 yaml_2.2.1 fastmap_1.1.0 compiler_4.1.2
## [17] htmltools_0.5.2 sass_0.4.0
# Print the current working directory for this session.
getwd()
## [1] "C:/Users/Daniel/Documents/GitHub/MGSC-310-Project"
library("here")
library("tidyverse")
library("forcats")
library("rsample")
library("ggplot2")
library("ggmap")
library("dplyr")
library("lubridate")
library("xgboost")
library('DiagrammeR')
library('Matrix')
# Locate the accidents extract under the project's "datasets" folder,
# load it into a data frame, and print per-column summary statistics.
accidents_csv <- here("datasets", "US_Accidents_Dec20_updated.csv")
crashes <- read.csv(accidents_csv)
summary(crashes)
## ID Severity Start_Time
## Length:1516064 Min. :1.000 Length:1516064
## Class :character 1st Qu.:2.000 Class :character
## Mode :character Median :2.000 Mode :character
## Mean :2.239
## 3rd Qu.:2.000
## Max. :4.000
##
## End_Time Start_Lat Start_Lng End_Lat
## Length:1516064 Min. :24.57 Min. :-124.50 Min. :24.57
## Class :character 1st Qu.:33.85 1st Qu.:-118.21 1st Qu.:33.85
## Mode :character Median :37.35 Median : -94.38 Median :37.35
## Mean :36.90 Mean : -98.60 Mean :36.90
## 3rd Qu.:40.73 3rd Qu.: -80.87 3rd Qu.:40.73
## Max. :49.00 Max. : -67.11 Max. :49.08
##
## End_Lng Distance.mi. Description
## Min. :-124.50 Min. : 0.0000 Length:1516064
## 1st Qu.:-118.21 1st Qu.: 0.0000 Class :character
## Median : -94.38 Median : 0.1780 Mode :character
## Mean : -98.60 Mean : 0.5873
## 3rd Qu.: -80.87 3rd Qu.: 0.5940
## Max. : -67.11 Max. :155.1860
##
## Number Street Side
## Min. : 0 Length:1516064 Length:1516064
## 1st Qu.: 1212 Class :character Class :character
## Median : 4000 Mode :character Mode :character
## Mean : 8908
## 3rd Qu.: 10100
## Max. :9999997
## NA's :1046095
## City County State
## Length:1516064 Length:1516064 Length:1516064
## Class :character Class :character Class :character
## Mode :character Mode :character Mode :character
##
##
##
##
## Zipcode Country Timezone
## Length:1516064 Length:1516064 Length:1516064
## Class :character Class :character Class :character
## Mode :character Mode :character Mode :character
##
##
##
##
## Airport_Code Weather_Timestamp Temperature.F.
## Length:1516064 Length:1516064 Min. :-89.00
## Class :character Class :character 1st Qu.: 47.00
## Mode :character Mode :character Median : 61.00
## Mean : 59.58
## 3rd Qu.: 73.00
## Max. :170.60
## NA's :43033
## Wind_Chill.F. Humidity... Pressure.in. Visibility.mi.
## Min. :-89.0 Min. : 1.00 Min. : 0.00 Min. : 0.00
## 1st Qu.: 40.8 1st Qu.: 48.00 1st Qu.:29.44 1st Qu.: 10.00
## Median : 57.0 Median : 68.00 Median :29.88 Median : 10.00
## Mean : 55.1 Mean : 64.66 Mean :29.55 Mean : 9.13
## 3rd Qu.: 71.0 3rd Qu.: 84.00 3rd Qu.:30.04 3rd Qu.: 10.00
## Max. :113.0 Max. :100.00 Max. :58.04 Max. :140.00
## NA's :449316 NA's :45509 NA's :36274 NA's :44211
## Wind_Direction Wind_Speed.mph. Precipitation.in.
## Length:1516064 Min. : 0.00 Min. : 0
## Class :character 1st Qu.: 4.60 1st Qu.: 0
## Mode :character Median : 7.00 Median : 0
## Mean : 7.63 Mean : 0
## 3rd Qu.: 10.40 3rd Qu.: 0
## Max. :984.00 Max. :24
## NA's :128862 NA's :510549
## Weather_Condition Amenity Bump
## Length:1516064 Length:1516064 Length:1516064
## Class :character Class :character Class :character
## Mode :character Mode :character Mode :character
##
##
##
##
## Crossing Give_Way Junction
## Length:1516064 Length:1516064 Length:1516064
## Class :character Class :character Class :character
## Mode :character Mode :character Mode :character
##
##
##
##
## No_Exit Railway Roundabout
## Length:1516064 Length:1516064 Length:1516064
## Class :character Class :character Class :character
## Mode :character Mode :character Mode :character
##
##
##
##
## Station Stop Traffic_Calming
## Length:1516064 Length:1516064 Length:1516064
## Class :character Class :character Class :character
## Mode :character Mode :character Mode :character
##
##
##
##
## Traffic_Signal Turning_Loop Sunrise_Sunset
## Length:1516064 Length:1516064 Length:1516064
## Class :character Class :character Class :character
## Mode :character Mode :character Mode :character
##
##
##
##
## Civil_Twilight Nautical_Twilight Astronomical_Twilight
## Length:1516064 Length:1516064 Length:1516064
## Class :character Class :character Class :character
## Mode :character Mode :character Mode :character
##
##
##
##
# Add parsed/typed "*_Clean" companions to the raw columns:
#   * timestamps are parsed from "YYYY-MM-DD HH:MM:SS" text with ymd_hms()
#   * categorical text columns become factors
# The raw columns are kept untouched alongside the new ones.
crashes_clean <-
  crashes %>%
  mutate(
    Start_Time_Clean = ymd_hms(Start_Time),
    End_Time_Clean = ymd_hms(End_Time),
    Weather_Condition_Clean = as.factor(Weather_Condition),
    # The raw data spells the same category both "CALM" and "Calm"; fold the
    # second spelling into the first so they are counted as one level.
    Wind_Direction_Clean = fct_recode(as.factor(Wind_Direction), CALM = "Calm"),
    Weather_Timestamp_Clean = ymd_hms(Weather_Timestamp),
    State_Clean = as.factor(State),
    County_Clean = as.factor(County),
    City_Clean = as.factor(City),
    Side_Clean = as.factor(Side)
  )
summary(crashes_clean)
## ID Severity Start_Time
## Length:1516064 Min. :1.000 Length:1516064
## Class :character 1st Qu.:2.000 Class :character
## Mode :character Median :2.000 Mode :character
## Mean :2.239
## 3rd Qu.:2.000
## Max. :4.000
##
## End_Time Start_Lat Start_Lng End_Lat
## Length:1516064 Min. :24.57 Min. :-124.50 Min. :24.57
## Class :character 1st Qu.:33.85 1st Qu.:-118.21 1st Qu.:33.85
## Mode :character Median :37.35 Median : -94.38 Median :37.35
## Mean :36.90 Mean : -98.60 Mean :36.90
## 3rd Qu.:40.73 3rd Qu.: -80.87 3rd Qu.:40.73
## Max. :49.00 Max. : -67.11 Max. :49.08
##
## End_Lng Distance.mi. Description
## Min. :-124.50 Min. : 0.0000 Length:1516064
## 1st Qu.:-118.21 1st Qu.: 0.0000 Class :character
## Median : -94.38 Median : 0.1780 Mode :character
## Mean : -98.60 Mean : 0.5873
## 3rd Qu.: -80.87 3rd Qu.: 0.5940
## Max. : -67.11 Max. :155.1860
##
## Number Street Side
## Min. : 0 Length:1516064 Length:1516064
## 1st Qu.: 1212 Class :character Class :character
## Median : 4000 Mode :character Mode :character
## Mean : 8908
## 3rd Qu.: 10100
## Max. :9999997
## NA's :1046095
## City County State
## Length:1516064 Length:1516064 Length:1516064
## Class :character Class :character Class :character
## Mode :character Mode :character Mode :character
##
##
##
##
## Zipcode Country Timezone
## Length:1516064 Length:1516064 Length:1516064
## Class :character Class :character Class :character
## Mode :character Mode :character Mode :character
##
##
##
##
## Airport_Code Weather_Timestamp Temperature.F.
## Length:1516064 Length:1516064 Min. :-89.00
## Class :character Class :character 1st Qu.: 47.00
## Mode :character Mode :character Median : 61.00
## Mean : 59.58
## 3rd Qu.: 73.00
## Max. :170.60
## NA's :43033
## Wind_Chill.F. Humidity... Pressure.in. Visibility.mi.
## Min. :-89.0 Min. : 1.00 Min. : 0.00 Min. : 0.00
## 1st Qu.: 40.8 1st Qu.: 48.00 1st Qu.:29.44 1st Qu.: 10.00
## Median : 57.0 Median : 68.00 Median :29.88 Median : 10.00
## Mean : 55.1 Mean : 64.66 Mean :29.55 Mean : 9.13
## 3rd Qu.: 71.0 3rd Qu.: 84.00 3rd Qu.:30.04 3rd Qu.: 10.00
## Max. :113.0 Max. :100.00 Max. :58.04 Max. :140.00
## NA's :449316 NA's :45509 NA's :36274 NA's :44211
## Wind_Direction Wind_Speed.mph. Precipitation.in.
## Length:1516064 Min. : 0.00 Min. : 0
## Class :character 1st Qu.: 4.60 1st Qu.: 0
## Mode :character Median : 7.00 Median : 0
## Mean : 7.63 Mean : 0
## 3rd Qu.: 10.40 3rd Qu.: 0
## Max. :984.00 Max. :24
## NA's :128862 NA's :510549
## Weather_Condition Amenity Bump
## Length:1516064 Length:1516064 Length:1516064
## Class :character Class :character Class :character
## Mode :character Mode :character Mode :character
##
##
##
##
## Crossing Give_Way Junction
## Length:1516064 Length:1516064 Length:1516064
## Class :character Class :character Class :character
## Mode :character Mode :character Mode :character
##
##
##
##
## No_Exit Railway Roundabout
## Length:1516064 Length:1516064 Length:1516064
## Class :character Class :character Class :character
## Mode :character Mode :character Mode :character
##
##
##
##
## Station Stop Traffic_Calming
## Length:1516064 Length:1516064 Length:1516064
## Class :character Class :character Class :character
## Mode :character Mode :character Mode :character
##
##
##
##
## Traffic_Signal Turning_Loop Sunrise_Sunset
## Length:1516064 Length:1516064 Length:1516064
## Class :character Class :character Class :character
## Mode :character Mode :character Mode :character
##
##
##
##
## Civil_Twilight Nautical_Twilight Astronomical_Twilight
## Length:1516064 Length:1516064 Length:1516064
## Class :character Class :character Class :character
## Mode :character Mode :character Mode :character
##
##
##
##
## Start_Time_Clean End_Time_Clean
## Min. :2016-02-08 00:37:08 Min. :2016-02-08 06:37:08
## 1st Qu.:2018-07-17 14:41:25 1st Qu.:2018-07-17 17:13:14
## Median :2020-01-24 11:16:33 Median :2020-01-24 13:38:15
## Mean :2019-07-15 07:01:48 Mean :2019-07-15 11:42:20
## 3rd Qu.:2020-10-22 13:01:30 3rd Qu.:2020-10-22 17:50:19
## Max. :2020-12-31 23:28:56 Max. :2021-01-01 00:00:00
##
## Weather_Condition_Clean Wind_Direction_Clean
## Fair :465252 CALM :202870
## Mostly Cloudy:193595 Calm : 79192
## Clear :180223 WNW : 77743
## Cloudy :161291 NW : 75810
## Partly Cloudy:133102 W : 72059
## Overcast : 87853 SSW : 69901
## (Other) :294748 (Other):938489
## Weather_Timestamp_Clean State_Clean
## Min. :2016-02-08 00:53:00 CA :448833
## 1st Qu.:2018-07-10 10:55:30 FL :153007
## Median :2020-01-22 05:53:00 OR : 87484
## Mean :2019-07-12 00:02:11 TX : 75142
## 3rd Qu.:2020-10-21 04:54:00 NY : 60974
## Max. :2020-12-31 23:35:00 MN : 52345
## NA's :30264 (Other):638279
## County_Clean City_Clean Side_Clean
## Los Angeles : 138819 Los Angeles: 39984 L: 221502
## Orange : 49833 Miami : 36233 R:1294562
## Miami-Dade : 47382 Charlotte : 22203
## San Bernardino: 30251 Houston : 20843
## San Diego : 26623 Dallas : 19497
## Sacramento : 25941 Sacramento : 18431
## (Other) :1197215 (Other) :1358873
# Map of accident start coordinates, coloured by severity level.
qmplot(
  Start_Lng, Start_Lat,
  data = crashes,
  maptype = "toner-lite",
  color = factor(Severity)
)

# Same map, drawn from the recorded end coordinates instead.
qmplot(
  End_Lng, End_Lat,
  data = crashes,
  maptype = "toner-lite",
  color = factor(Severity)
)

# Distribution of accident severity.
# Severity is a small set of discrete scores (1 to 4 in the summary above), so
# count each value with geom_bar() rather than slicing the range into the
# 30 default bins that geom_histogram() would use.
ggplot(data = crashes_clean, aes(x = Severity)) +
  geom_bar()

# Affected road distance (miles) against the parsed accident start time,
# drawn as a line.
crashes_clean %>%
  ggplot(aes(x = Start_Time_Clean, y = Distance.mi.)) +
  geom_line()

# Accident counts over time.
# Start_Time_Clean is a date-time, so the bin width is given in seconds:
# 24 * 60 * 60 = 86400 seconds, i.e. one bin per day.
ggplot(crashes_clean, aes(x = Start_Time_Clean)) +
  geom_histogram(binwidth = 24 * 60 * 60)

# Accident counts by day of the week.
# Use the already-parsed Start_Time_Clean column rather than the raw
# Start_Time text, so the weekday comes from the same parsed timestamps as
# the other time-based plots and the strings are not parsed a second time.
crashes_clean %>%
  mutate(wday = wday(Start_Time_Clean, label = TRUE)) %>%
  ggplot(aes(x = wday)) +
  geom_bar()

# 75/25 train/test split of the cleaned data.
# initial_split() samples rows at random, so fix the RNG seed first;
# otherwise every run produces a different partition.
set.seed(1)
crashes_split <- initial_split(crashes_clean, prop = 0.75)
crashes_train <- training(crashes_split)
crashes_test <- testing(crashes_split)
# Modelling frame for XGBoost: start from the raw data, remove the columns
# listed below, then keep only rows with no NA in any remaining column.
# (Number, for example, is NA for 1,046,095 rows in the summary above.)
# NOTE(review): the remaining text columns (e.g. Start_Time, End_Time, City)
# stay as character here — confirm they are all meant to be model features.
crashes_xgb <- crashes %>%
  select(-c(
    ID,
    Description,
    Street,
    Weather_Timestamp,
    Number,
    Airport_Code,
    Country,
    Turning_Loop
  )) %>%
  drop_na()
# ---- XGBoost severity model: split, encode, fit, evaluate ----
# NOTE: the knitted console output that used to follow these statements came
# from the previous version of this code and must be regenerated by re-running.

# Fix the RNG so the split and the row/column subsampling are repeatable.
# `seed = 1` is also passed to xgboost() below; older releases of the R
# package ignore that parameter and rely on set.seed() instead.
set.seed(1)

# Convert text columns to factors BEFORE splitting. Factor levels are then
# shared by both partitions, so the two design matrices built below get
# exactly the same one-hot columns in the same order. Building them from
# character columns would derive the levels separately per partition.
crashes_xgb_encoded <- crashes_xgb %>%
  mutate(across(where(is.character), as.factor))

crashes_split_xgb <- initial_split(crashes_xgb_encoded, prop = 0.75)
crashes_train_xbg <- training(crashes_split_xgb)
crashes_test_xbg <- testing(crashes_split_xgb)

# Sparse one-hot design matrices; "- 1" removes the intercept column.
sparse_matrix_train <- sparse.model.matrix(Severity ~ . - 1, data = crashes_train_xbg)
sparse_matrix_test <- sparse.model.matrix(Severity ~ . - 1, data = crashes_test_xbg)
stopifnot(identical(colnames(sparse_matrix_train), colnames(sparse_matrix_test)))

# XGBoost multiclass labels must be integers in [0, num_class).
y_train <- as.integer(crashes_train_xbg$Severity) - 1L
y_test <- as.integer(crashes_test_xbg$Severity) - 1L
n_classes <- max(y_train, y_test) + 1L

xgb_train <- xgb.DMatrix(data = sparse_matrix_train, label = y_train)
xgb_test <- xgb.DMatrix(data = sparse_matrix_test, label = y_test)

# Fit the booster. num_class is taken from the labels instead of the
# hard-coded 12, which created eight classes that never occur, and the
# rounds argument is spelled out in full (`nrounds`) rather than relying on
# partial argument matching.
xgb <- xgboost(
  data = xgb_train,
  eta = 0.1,
  max_depth = 15,
  nrounds = 15,
  subsample = 0.5,
  colsample_bytree = 0.5,
  seed = 1,
  eval_metric = "merror",
  objective = "multi:softprob",
  num_class = n_classes,
  nthread = 3
)

# Score the held-out rows. For multi:softprob the prediction is one
# probability per (row, class); when returned as a flat vector the values
# for a row are contiguous, so reshape row-wise into rows x classes.
y_pred <- predict(xgb, newdata = xgb_test)
print(length(y_pred))
print(head(y_pred))

class_probs <- if (is.matrix(y_pred)) {
  y_pred
} else {
  matrix(y_pred, ncol = n_classes, byrow = TRUE)
}

# Predicted label = most probable class (columns are 1-based, labels 0-based).
pred_class <- max.col(class_probs, ties.method = "first") - 1L

# Misclassification rate on the test set.
err <- mean(pred_class != y_test)
print(paste("test-error=", err))
# Plot the tree with index 1 from the fitted model, keep the returned
# object, and print it.
# NOTE(review): xgboost documents tree indices as zero-based, so index 1
# would be the second tree — confirm that is the one intended.
# NOTE(review): `feature.keep` does not appear among the documented arguments
# of xgb.plot.tree() — confirm it has any effect here.
tree_plot <- xgb.plot.tree(
  model = xgb,
  trees = 1,
  feature.keep = 3
)
tree_plot

# Feature importance for the fitted model: compute the table, chart it,
# then print the table itself.
importance_matrix <- xgb.importance(model = xgb)
xgb.plot.importance(importance_matrix = importance_matrix)

importance_matrix